1. ggplot2 points
library(dplyr)
library(ggplot2)
# 1. max. vol and price
diamonds %>%
mutate(theoretical_max_vol = x * y * z) %>%
ggplot(aes(x = theoretical_max_vol, y = price)) + geom_point()

## we have an outlier, removing it
diamonds %>%
mutate(theoretical_max_vol = x * y * z) %>%
filter(theoretical_max_vol < 3000) %>%
ggplot(aes(x = theoretical_max_vol, y = price)) + geom_point()

# 2. aesthetics options
# taking only a smaller subset
set.seed(123)
diamond_df <- diamonds %>% sample_n(1000)
ggplot(diamond_df, aes(x = carat, y = price)) + geom_point()

ggplot(diamond_df, aes(x = carat, y = price, color = cut)) + geom_point()

ggplot(diamond_df, aes(x = carat, y = price, color = color)) + geom_point()

ggplot(diamond_df, aes(x = carat, y = price, color = color, shape = cut)) + geom_point()

ggplot(diamond_df, aes(x = carat, y = price)) + geom_point(alpha = 0.5)

# 3. assign plot to a variable
plot1 <- ggplot(diamond_df, aes(x = carat, y = price)) + geom_point()
plot1 + ggtitle("This is a title")

ggsave("plot1.pdf", plot1, width = 7, height = 5)
ggsave("plot1.png", plot1, width = 7, height = 5)
Explain the differences:
# 1. data and aesthetics are set globally (at least for this plot)
ggplot(df, aes(x = time, y = value, color = symbol)) +
geom_line()
# 2. data is set globally (this plot), aesthetics are set only for geom_line()
ggplot(df) +
geom_line(aes(x = time, y = value, color = symbol))
# 3. data and aesthetics are set only for geom_line()
ggplot() +
geom_line(data = df, aes(x = time, y = value, color = symbol))
# 7: Yes, the code would work, data, x, and color are set globally (for the plot), y is set for geom_line() only
ggplot(df, aes(x = time, color = symbol)) +
geom_line(aes(y = value))
2. Geometrics
2D Random Walk
# 1. create some data
set.seed(123)
rwalk <- tibble(
id = 1:10000,
x = cumsum(rnorm(10000)),
y = cumsum(rnorm(10000))
)
ggplot(rwalk, aes(x = x, y = y, color = id)) + geom_path()

stock returns
library(tidyquant)
dow_constituents <- tq_index("dow-jones")
dow_stocks <- tq_get(dow_constituents$symbol, from = "2010-01-01", to = "2015-12-31")
dow_df <- left_join(dow_stocks, dow_constituents, by = "symbol")
# plot the lines individually
ggplot(dow_df, aes(x = date, y = adjusted, color = company)) +
geom_line() +
theme(legend.position = "none") # remove legend...

# indexed plot
dow_df %>%
group_by(company) %>%
mutate(idx_price = adjusted / adjusted[1] * 100) %>%
ggplot(aes(x = date, y = idx_price, color = company)) +
geom_line() +
theme(legend.position = "none") # remove legend...

# density plot of returns
dow_df %>%
group_by(company) %>%
mutate(returns = adjusted / lag(adjusted) - 1) %>%
ggplot(aes(x = returns, fill = company)) +
geom_density(alpha = 0.1) +
scale_x_continuous(limits = c(-0.1, 0.1)) + # limit the display to -0.1 til 0.1
theme(legend.position = "none") # remove legend...

finance market structure
goog_df <- tq_get("GOOG", from = "2016-01-01", to = "2016-12-31")
ggplot(goog_df, aes(x = date, y = adjusted)) +
geom_ribbon(aes(ymin = low, ymax = high), fill = "orange") +
geom_line()

ggplot(goog_df, aes(x = date, y = adjusted)) +
geom_ribbon(aes(ymin = low, ymax = high), fill = "lightgreen") +
geom_step()

World Bank
library(wbstats)
wb_df <- wb(indicator = "SP.POP.TOTL", startdate = 2000, enddate = 2016)
wb_selected_df <- wb_df %>%
filter(country %in% c("Germany", "European Union", "France", "United States")) %>%
mutate(date = as.numeric(date))
ggplot(wb_selected_df, aes(x = date, y = value, color = country)) + geom_line()

wb_selected_df %>%
group_by(country) %>%
arrange(date) %>%
mutate(diff = value / lag(value) - 1) %>%
ggplot(aes(x = date, y = diff, color = country)) + geom_boxplot()
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

Advanced Gapminder
library(gapminder)
gapminder %>%
filter(year == 2007) %>%
ggplot(aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)) +
geom_point() +
scale_x_log10()

3. Functions
# 1. trigonometry
fun1 <- function(x) sin(x + 1) + 3
fun2 <- function(x) cos(-x)* 3
ggplot(tibble(x = -10:10), aes(x = x)) +
stat_function(fun = fun1, n = 1000, color = "blue") +
stat_function(fun = fun2, n = 1000, color = "red") +
scale_y_continuous(limits = c(-10, 10))

# 2. distributions
ggplot(tibble(x = -3:3), aes(x = x)) +
stat_function(fun = dnorm, aes(color = "normal")) +
stat_function(fun = dunif, aes(color = "uniform")) +
stat_function(fun = dt, aes(color = "t"), args = list(df = 2)) +
stat_function(fun = df, aes(color = "F"), args = list(df1 = 1, df2 = 20), n = 1000) +
stat_function(fun = dchisq, aes(color = "chisq"), args = list(df = 2), n = 1000) +
scale_y_continuous(limits = c(0, 3))

# capm
capm_upper <- function(sd) {
div <- 147.8/4037.6
x <- div + sqrt(div^2 - (6 - 2339.9 * sd^2) / 4037.6)
return(x)
}
capm_lower <- function(sd) {
div <- 147.8/4037.6
x <- div - sqrt(div^2 - (6 - 2339.9 * sd^2) / 4037.6)
return(x)
}
ggplot(tibble(sd = c(0, 0.03)), aes(x = sd)) +
stat_function(fun = capm_upper, n = 10000, aes(color = "upper")) +
stat_function(fun = capm_lower, n = 10000, aes(color = "lower")) +
labs(title = "The Efficient Frontier", caption = "Source: DataShenanigans",
x = "Risk", y = "Expected Returns", color = "Efficient\nFrontier")
## Warning in sqrt(div^2 - (6 - 2339.9 * sd^2)/4037.6): NaNs produced
## Warning in sqrt(div^2 - (6 - 2339.9 * sd^2)/4037.6): NaNs produced
## Warning: Removed 5291 rows containing missing values (geom_path).
## Warning: Removed 5291 rows containing missing values (geom_path).

4. Colors
# 1. Diamond plot
set.seed(42)
diamond_df <- diamonds %>% sample_n(1000)
diamond_plot <- ggplot(diamond_df, aes(x = carat, y = price, color = cut)) +
geom_point()
diamond_plot

diamond_plot + scale_color_brewer(palette = "Dark2")

# 2. skipped
# 3. colors are not displayed properly because we map the "drv" to fill, but specify color (usage of "scale_fill_manual" instead of "scale_color_manual" would resolve the issue)
5. Annotations
library(ggrepel)
ggplot(filter(mpg, manufacturer == "audi"), aes(x = displ, y = cty, label = model)) +
geom_point() +
geom_label()

ggplot(filter(mpg, manufacturer == "audi"), aes(x = displ, y = cty, label = model)) +
geom_point() +
geom_label_repel()

6. Themes
library(ggthemes)
diamond_plot + theme_economist() + scale_color_economist()

diamond_plot + theme_gdocs() + scale_color_gdocs()

diamond_plot + theme_solarized() + scale_color_solarized()

diamond_plot + theme_wsj() + scale_color_wsj()

7. Facets
library(nycflights13)
library(lubridate)
library(scales)
flights_df <- flights %>%
sample_n(10000) %>%
mutate(dep_time2 = ymd_hm(paste("2000-01-01", hour, minute)))
delay_plot <- ggplot(flights_df, aes(x = dep_time2, y = arr_delay)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_datetime(labels = date_format("%H:%M"))
delay_plot

delay_plot + facet_wrap(~origin)

# see: http://stackoverflow.com/a/12104207/3048453
facet_labeller <- function(variable, value){
return(facet_names[value])
}
facet_names <- list(
"EWR" = "Newark Airport",
"JFK" = "JFK Airport",
"LGA" = "LaGuardia Airport"
)
delay_plot + facet_wrap(~origin, labeller = facet_labeller)

8. Maps
USA-map from the workshop
library(ggmap)
state_df <- map_data("state") %>% as_data_frame()
ggplot(state_df, aes(x = long, y = lat, group = group)) +
geom_polygon(fill = "white", color = "black") +
theme_map() + coord_map()

# include the population data
## gather the population data from library(datasets)
library(datasets)
stats_df <- tibble(
region = tolower(state.name),
long_center = state.center$x,
lat_center = state.center$y,
pop = state.x77[,"Population"],
lifeexp = state.x77[, "Life Exp"]
)
usa_df <- left_join(state_df, stats_df, by = "region")
labels_df <- usa_df %>% select(region, long_center, lat_center, group) %>% distinct()
ggplot() +
geom_polygon(data = usa_df, aes(x = long, y = lat, group = group, fill = pop/1000)) +
theme_map() +
coord_map() +
labs(title = "Population of US States",
subtitle = "As of 1970",
fill = "Population\nin Mio")

ggplot() +
geom_polygon(data = usa_df, aes(x = long, y = lat, group = group, fill = pop/1000)) +
theme_map() +
coord_map() +
labs(title = "Population of US States",
subtitle = "As of 1970",
fill = "Population\nin Mio") +
geom_label_repel(data = labels_df, aes(x = long_center, y = lat_center, label = region))

Earthquake map
earthquakes <- read_csv("http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_month.csv")
world_df <-map_data("world") %>% as_data_frame()
ggplot(world_df, aes(x = long, y = lat, group = group)) +
geom_polygon() +
geom_point(data = earthquakes, aes(x = longitude, y = latitude, size = mag,
group = 1), color = "red", alpha = 0.1) +
scale_size_continuous(name = "Magnitude", range = c(0,4)) +
theme_map() +
labs(title = "Global Earthquakes", subtitle = "Last 30 days",
caption = "Source: USGS as of 2017-04-16") +
guides(size = guide_legend(override.aes = list(alpha = 1)))
